03 - Revisar la limpieza

Autor/a

Yann Say

Fecha de publicación

2 de enero de 2025

library(cleaningtools)
library(dplyr)

# Example inputs shipped with cleaningtools: the raw dataset and the KOBO
# survey / choices sheets.
my_raw_dataset <- cleaningtools::cleaningtools_raw_data
my_kobo_survey <- cleaningtools::cleaningtools_survey
my_kobo_choice <- cleaningtools::cleaningtools_choices

# The filled cleaning log is read from the second sheet of the workbook.
my_filled_log <- readxl::read_excel(
  path = "../inputs/02 - example - cleaning-log-with-kobo - filled.xlsx",
  sheet = 2
)

# Build the clean dataset from the raw data and the filled cleaning log.
# The raw data identifies surveys by "X_uuid", the log by "uuid".
my_clean_data <- create_clean_data(
  raw_dataset = my_raw_dataset,
  raw_data_uuid_column = "X_uuid",
  cleaning_log = my_filled_log,
  cleaning_log_uuid_column = "uuid",
  cleaning_log_question_column = "question",
  cleaning_log_new_value_column = "new_value",
  cleaning_log_change_type_column = "change_type"
)

# Recreate the parent columns using the KOBO tool ("." is the select-multiple
# separator) and pass the filled log so it can be appended to.
# The result is a list; later steps read its `data_with_fix_concat` and
# `cleaning_log` elements.
my_clean_data2 <- recreate_parent_column(
  dataset = my_clean_data,
  uuid_column = "X_uuid",
  kobo_survey = my_kobo_survey,
  kobo_choices = my_kobo_choice,
  sm_separator = ".",
  cleaning_log_to_append = my_filled_log
)

review_others

En el registro de limpieza, algunos valores de texto abierto se cambian a vacío. Algunas preguntas de texto abierto están vinculadas a una lógica condicional, por ejemplo: «¿Qué es X?» seguida de «Otro, por favor especifique». En algunos casos, algunos valores deben ser cambiados. En el siguiente ejemplo, se cambió el valor de water_supply_other_neighbourhoods_why de la encuesta con uuid 019bc718-c06a-46b8-bba8-c84f6c6efbd5.

# Show the log entries that blank out water_supply_other_neighbourhoods_why.
filter(
  my_filled_log,
  question == "water_supply_other_neighbourhoods_why" &
    change_type == "blank_response"
)
uuid old_value question issue check_id check_binding change_type new_value enumerator_num
019bc718-c06a-46b8-bba8-c84f6c6efbd5 لا اعلم water_supply_other_neighbourhoods_why recode other NA water_supply_other_neighbourhoods_why / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 blank_response NA 12

En la herramienta KOBO se puede notar una lógica condicional basada en la variable water_supply_other_neighbourhoods.

# Look up the `relevant` (skip logic) expression of the open-text question
# in the KOBO survey sheet.
my_kobo_survey %>%
  filter(name == "water_supply_other_neighbourhoods_why") %>%
  select(
    type,
    name,
    relevant
  )
type name relevant
text water_supply_other_neighbourhoods_why selected(${water_supply_other_neighbourhoods},'somewhat_worse') or selected(${water_supply_other_neighbourhoods},'much_worse')
# For the survey from the example, compare the parent question with its
# open-text follow-up in the clean data.
my_clean_data %>%
  filter(X_uuid == "019bc718-c06a-46b8-bba8-c84f6c6efbd5") %>%
  select(
    water_supply_other_neighbourhoods,
    water_supply_other_neighbourhoods_why
  )
water_supply_other_neighbourhoods water_supply_other_neighbourhoods_why
somewhat_worse NA

¿Debe cambiarse el valor de water_supply_other_neighbourhoods? Depende de la pregunta y de la lógica condicional, pero es importante señalar estos casos para que se pueda tomar una decisión.

# Run review_others() on the dataset with recreated parent columns.
# consent_telephone_number is excluded from the check.
review_other_log <- review_others(
  dataset = my_clean_data2$data_with_fix_concat,
  uuid_column = "X_uuid",
  kobo_survey = my_kobo_survey,
  columns_not_to_check = "consent_telephone_number"
)
Warning in create_logic_for_other(kobo_survey = kobo_survey,
compare_with_dataset = TRUE, : The following parent names: well_quality,
spring_quality, rainwater_quality, surface_quality, why_not_connected were not
found in the dataset. The function is ignoring them.

review_cleaning

# Deletion log: the entries that remove a whole survey.
my_deletion_log <- filter(
  my_clean_data2$cleaning_log,
  change_type == "remove_survey"
)

# Remaining log: every other entry, excluding any entry that belongs to a
# survey listed in the deletion log.
my_filled_log_no_deletion <- filter(
  my_clean_data2$cleaning_log,
  change_type != "remove_survey",
  !uuid %in% my_deletion_log$uuid
)

# Run review_cleaning() on the raw data, the clean data and the two logs
# (changes and deletions).
review_of_cleaning <- review_cleaning(
  raw_dataset = my_raw_dataset,
  raw_dataset_uuid_column = "X_uuid",
  clean_dataset = my_clean_data2$data_with_fix_concat,
  clean_dataset_uuid_column = "X_uuid",
  cleaning_log = my_filled_log_no_deletion,
  cleaning_log_uuid_column = "uuid",
  cleaning_log_question_column = "question",
  cleaning_log_new_value_column = "new_value",
  cleaning_log_change_type_column = "change_type",
  cleaning_log_old_value_column = "old_value",
  deletion_log = my_deletion_log,
  deletion_log_uuid_column = "uuid"
)

# Print the review table.
review_of_cleaning
uuid df.question df.change_type df.new_value cl.new_value df.old_value cl.old_value comment

Descargas